# --- Packages ---
library(tidyverse)
library(readxl)

# --- Load data & light cleaning ---
# The .RData file is expected to provide the post-level object `df_zhengfawei`
# (columns used below: account, text, date).
load(file = "raw_data/df_zhengfawei.RData")

# dataset 1: Provincial Political and Legal Affairs Commission for first outbreak
# --- date window & exclusions ---
# Both bounds are inclusive.
date_start <- as.Date("2018-12-31")
date_end   <- as.Date("2020-12-31")
# Accounts dropped from the main sample (5 provinces & the central account).
excluded_accounts <- c(
  "广东政法", "贵州长安网", "长安兵团", "长安河北", "陕西政法",
  "中央政法委长安剑"
)

# Keep only the needed window early, then normalize account/text.
# Filtering before the string clean-up avoids running gsub()/sub() on rows
# that are discarded anyway; the result is the same because the clean-up
# never touches `date`.
df <- df_zhengfawei |>
  # Coerce to Date like the other datasets in this script do; this is a no-op
  # when `date` is already a Date and guarantees day-level grouping later.
  mutate(date = as.Date(date)) |>
  filter(date >= date_start, date <= date_end) |>
  mutate(
    # Strip everything from the first "_20" onward in the account name
    # (presumably a year/date suffix appended at export time -- verify).
    account = sub("_20.*", "", account),
    # Remove embedded newlines so keyword matching works across line breaks.
    text = gsub("\n", "", text)
  )

# --- Read account↔province mapping ---
account_province <- read_xlsx("raw_data/account_name.xlsx")

# Each account must appear at most once in the mapping. A repeated account
# would make the left_join() below duplicate posts and silently inflate every
# grid count derived from `data_grid`.
stopifnot(
  "duplicated `account` values in raw_data/account_name.xlsx" =
    anyDuplicated(account_province$account) == 0L
)

# --- Subset: grid-related posts and attach province info ---
# Keep posts mentioning "网格", but exclude "网格线" and "黄色网格"
data_grid <- df |>
  filter(
    str_detect(text, "网格"),
    !str_detect(text, "网格线|黄色网格")
  ) |>
  # Rename the post-level `province` column before joining (presumably to
  # avoid a name clash with a province column in the mapping -- verify).
  rename(shengfen = province) |>
  left_join(account_province, by = "account")

# --- Main sample: drop the excluded accounts (5 provinces & central account) ---
total_articles <- filter(df, !(account %in% excluded_accounts))
grid_articles  <- filter(data_grid, !(account %in% excluded_accounts))

# --- Posts per account per day (all posts vs. grid-related posts) ---
count_total <- arrange(
  count(total_articles, account, date, name = "count"),
  account, date
)

count_grid <- arrange(
  count(grid_articles, account, date, name = "count"),
  account, date
)

# --- Collapse across accounts to one row per day, then merge ---
# Days that have posts but no grid-related post get grid_count = 0.
daily_first_outbreak <- count_total |>
  count(date, wt = count, name = "total_count") |>
  left_join(
    count(count_grid, date, wt = count, name = "grid_count"),
    by = "date"
  ) |>
  mutate(grid_count = coalesce(grid_count, 0L))

# --- Write the daily series to disk ---
save(daily_first_outbreak, file = "data/daily_first_outbreak.RData")

# dataset 2 (robustness sample): excludes 5 provinces + Jiangsu + central account
excluded_accounts2 <- c(
  "广东政法", "贵州长安网", "长安兵团", "长安河北", "陕西政法",
  "江苏政法", "中央政法委长安剑"
)

# --- Robustness sample: drop the excluded accounts (now including Jiangsu) ---
total_articles2 <- filter(df, !(account %in% excluded_accounts2))
grid_articles2  <- filter(data_grid, !(account %in% excluded_accounts2))

# --- Posts per account per day (all posts vs. grid-related posts) ---
count_total2 <- arrange(
  count(total_articles2, account, date, name = "count"),
  account, date
)

count_grid2 <- arrange(
  count(grid_articles2, account, date, name = "count"),
  account, date
)

# --- Collapse across accounts to one row per day, then merge ---
# Days that have posts but no grid-related post get grid_count = 0.
daily_first_outbreak2 <- count_total2 |>
  count(date, wt = count, name = "total_count") |>
  left_join(
    count(count_grid2, date, wt = count, name = "grid_count"),
    by = "date"
  ) |>
  mutate(grid_count = coalesce(grid_count, 0L))

# --- Write the daily series to disk ---
save(daily_first_outbreak2, file = "data/daily_first_outbreak_robust.RData")

# dataset 3: data for testing on the period of Dynamic Zero Covid
# municipal releases
load("raw_data/shijifabu.RData")

# All posts per day and city.
daily_total_count <- shijifabu |>
  mutate(date = as.Date(date)) |>
  group_by(date, City) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(date, City)

# Grid-related posts per day and city: text mentions "网格" but neither
# "网格线" nor "黄色网格".
daily_grid_count <- shijifabu |>
  mutate(date = as.Date(date)) |>
  filter(
    grepl("网格", text),
    !grepl("网格线|黄色网格", text)
  ) |>
  group_by(date, City) |>
  summarise(daily_grid = n(), .groups = "drop") |>
  arrange(date, City)

# Replace the raw object with the day x city panel; cells without a match in
# either table are filled with 0.
shijifabu <- daily_total_count |>
  full_join(daily_grid_count, by = c("date", "City")) |>
  replace_na(list(total_count = 0L, daily_grid = 0L)) |>
  arrange(date, City)

save(shijifabu, file = "data/shijifabu.RData")

# municipal Political and Legal Affairs Committee

load("raw_data/shijizhengfa.RData")

# All posts per day and city.
daily_total_count <- shijizhengfa |>
  mutate(date = as.Date(date)) |>
  group_by(date, City) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(date, City)

# Grid-related posts per day and city: text mentions "网格" and contains none
# of the excluded terms ("网格线", "黄色网格", and the staffing-related words
# "招聘", "培训", "入职", "聘用").
daily_grid_count <- shijizhengfa |>
  mutate(date = as.Date(date)) |>
  filter(
    grepl("网格", text),
    !grepl("网格线|黄色网格|招聘|培训|入职|聘用", text)
  ) |>
  group_by(date, City) |>
  summarise(daily_grid = n(), .groups = "drop") |>
  arrange(date, City)

# Replace the raw object with the day x city panel; cells without a match in
# either table are filled with 0.
shijizhengfa <- daily_total_count |>
  full_join(daily_grid_count, by = c("date", "City")) |>
  replace_na(list(total_count = 0L, daily_grid = 0L)) |>
  arrange(date, City)

save(shijizhengfa, file = "data/shijizhengfa.RData")

# provincial releases

load("raw_data/shengjifabu.RData")

# All posts per day and province.
daily_total_count <- shengjifabu |>
  mutate(date = as.Date(date)) |>
  group_by(date, province) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(date, province)

# Grid-related posts per day and province: text mentions "网格" but neither
# "网格线" nor "黄色网格".
daily_grid_count <- shengjifabu |>
  mutate(date = as.Date(date)) |>
  filter(
    grepl("网格", text),
    !grepl("网格线|黄色网格", text)
  ) |>
  group_by(date, province) |>
  summarise(daily_grid = n(), .groups = "drop") |>
  arrange(date, province)

# Replace the raw object with the day x province panel; cells without a match
# in either table are filled with 0.
shengjifabu <- daily_total_count |>
  full_join(daily_grid_count, by = c("date", "province")) |>
  replace_na(list(total_count = 0L, daily_grid = 0L)) |>
  arrange(date, province)

save(shengjifabu, file = "data/shengjifabu.RData")

# provincial Political and Legal Affairs Committee
load("raw_data/shengjizhengfa.RData")

# Drop the "兵团" unit. Note that filter() also discards rows whose province
# is NA, because the comparison evaluates to NA for them.
shengjizhengfa <- filter(shengjizhengfa, province != "兵团")

# All posts per day and province.
daily_total_count <- shengjizhengfa |>
  mutate(date = as.Date(date)) |>
  group_by(date, province) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(date, province)

# Grid-related posts per day and province: text mentions "网格" and contains
# none of the excluded terms ("网格线", "黄色网格", and the staffing-related
# words "招聘", "培训", "入职", "聘用").
daily_grid_count <- shengjizhengfa |>
  mutate(date = as.Date(date)) |>
  filter(
    grepl("网格", text),
    !grepl("网格线|黄色网格|招聘|培训|入职|聘用", text)
  ) |>
  group_by(date, province) |>
  summarise(daily_grid = n(), .groups = "drop") |>
  arrange(date, province)

# Replace the raw object with the day x province panel; cells without a match
# in either table are filled with 0.
shengjizhengfa <- daily_total_count |>
  full_join(daily_grid_count, by = c("date", "province")) |>
  replace_na(list(total_count = 0L, daily_grid = 0L)) |>
  arrange(date, province)

save(shengjizhengfa, file = "data/shengjizhengfa.RData")
